#!/usr/bin/env python
# -*- coding: utf-8 -*-
# 
# Copyright (c) 2017 Baidu.com, Inc. All Rights Reserved
# 

"""
File: md.py
Author: zhangyang(zhangyang40@baidu.com)
Date: 2017/10/30 11:31
"""
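# Modeling and prediction
#
# Once the features described above have been built, there are many ways to
# train a model and make predictions. This script uses sklearn's SGDClassifier;
# in practice xgboost gives better results (most of our features are dense
# floating-point values, which suits GBDT-style models well).
# Note the cross-validation: 10-fold cross-validation is used here.
#
# NOTE: these notes are kept as comments on purpose. Only the module docstring
# may precede a `from __future__` import; a second string statement here would
# make the import below a SyntaxError.
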
from __future__ import division

import math

import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.linear_model import SGDClassifier


def train():
    """
    Fit a logistic-loss SGD classifier on the engineered training features.

    Reads "data_train.csv" from the current directory, uses the seven feature
    columns listed below as inputs and the "interested" column as the binary
    target (1 = interested, 0 = not interested).

    Returns:
        The fitted SGDClassifier.
    """
    feature_columns = ["invited", "user_reco", "evt_p_reco", "evt_c_reco",
                       "user_pop", "frnd_infl", "evt_pop"]
    frame = pd.read_csv("data_train.csv")
    # Re-framing with an explicit column list selects the features in a fixed order.
    feature_frame = pd.DataFrame(frame, index=None, columns=feature_columns)
    features = np.matrix(feature_frame)
    labels = np.array(frame.interested)
    model = SGDClassifier(loss="log", penalty="l2")
    model.fit(features, labels)
    return model


def validate():
    """
    Run 10-fold cross-validation on the training data and print the accuracy.

    Reads "data_train.csv" from the current directory, trains a fresh
    SGDClassifier on each training split, and prints the accuracy of every
    fold followed by the mean accuracy over all folds.

    Raises:
        ValueError: raised by KFold.split when the data has fewer rows than
            folds.
    """
    trainDf = pd.read_csv("data_train.csv")
    X = np.matrix(pd.DataFrame(trainDf, index=None,
                               columns=["invited", "user_reco", "evt_p_reco", "evt_c_reco",
                                        "user_pop", "frnd_infl", "evt_pop"]))
    y = np.array(trainDf.interested)
    # sklearn.model_selection.KFold takes the number of folds only; the
    # index pairs come from .split(), the object itself is not iterable.
    kfold = KFold(n_splits=10)
    avgAccuracy = 0
    run = 0
    # Index names avoid shadowing the module-level train()/test() functions.
    for train_idx, test_idx in kfold.split(X):
        clf = SGDClassifier(loss="log", penalty="l2")
        clf.fit(X[train_idx], y[train_idx])
        # Predict the whole fold at once instead of one row per call.
        predicted = np.asarray(clf.predict(X[test_idx])).ravel()
        accuracy = float(np.mean(predicted == y[test_idx]))
        # Single-argument print(...) gives the same output on Python 2 and 3.
        print("accuracy (run %d): %f" % (run, accuracy))
        avgAccuracy += accuracy
        run += 1
    print("Average accuracy %s" % (avgAccuracy / run,))


def test(clf):
    """
    Score the test features with a fitted classifier and write result.csv.

    Reads the user/event ids from "test.csv" and the feature rows from
    "data_test.csv" (every column of that file is used as a feature), then
    writes one line per feature row to "result.csv" with the columns
    user, event, outcome (predicted label) and dist (decision-function value).

    Args:
        clf: a fitted classifier exposing predict() and decision_function().
    """
    origTestDf = pd.read_csv("test.csv")
    users = origTestDf.user
    events = origTestDf.event
    testDf = pd.read_csv("data_test.csv")
    nrows = len(testDf)
    if nrows:
        Xp = np.matrix(testDf)
        # One vectorised call each instead of two estimator calls per row;
        # the float cast keeps the "1.0"/"0.0" outcome formatting.
        outcomes = np.asarray(clf.predict(Xp), dtype=float).ravel()
        dists = np.asarray(clf.decision_function(Xp), dtype=float).ravel()
    else:
        # No rows to score: only the header line is written.
        outcomes = dists = np.zeros(0)
    # Text mode so str can be written on Python 3 as well; the with-block
    # closes the file even if a write fails.
    with open("result.csv", "w") as fout:
        fout.write(",".join(["user", "event", "outcome", "dist"]) + "\n")
        for i in range(nrows):
            fields = [users[i], events[i], outcomes[i], dists[i]]
            fout.write(",".join(str(field) for field in fields) + "\n")


# Script entry point (runs at import time): fit the classifier on
# data_train.csv, then write the test-set predictions to result.csv.
clf = train()
test(clf)


# Generate the submission file
def byDist(x, y):
    """
    cmp-style comparator that orders items by descending value at index 1.

    Args:
        x, y: indexable items whose element [1] is the score to compare.

    Returns:
        -1 if x should come before y (x[1] > y[1]), 1 if it should come
        after, 0 if the two scores are equal.
    """
    # Compare directly instead of truncating the difference with int():
    # scores that differ by less than 1 would otherwise be treated as equal.
    return (y[1] > x[1]) - (y[1] < x[1])


def generate_submition_file():
    """
    Build the submission file "final_result.csv" from "result.csv".

    For every user in result.csv, the user's events are ordered by the
    "dist" column from highest to lowest (ties keep their file order) and
    written as one line: the user id followed by the quoted list of events.
    """
    resultDf = pd.read_csv("result.csv")
    # Text mode so str can be written on Python 3 as well; the with-block
    # closes the file even if a write fails.
    with open("final_result.csv", "w") as fout:
        fout.write(",".join(["User", "Events"]) + "\n")
        for name, group in resultDf.groupby("user"):
            pairs = zip(group.event.tolist(), group.dist.tolist())
            # A key-based sort works on Python 2 and 3 and compares the raw
            # scores, so differences smaller than 1 are ordered correctly.
            ranked = sorted(pairs, key=lambda pair: pair[1], reverse=True)
            # Build a real list: str() of a lazy map object is not the events.
            events = "\"" + str([event for event, _ in ranked]) + "\""
            fout.write(",".join([str(name), events]) + "\n")


# Runs at import time: turn the result.csv written above into final_result.csv.
generate_submition_file()
